install.packages(c("tidyverse", "dplyr","countrycode"), repos = "http://cran.rstudio.com/")
##
## The downloaded binary packages are in
## /var/folders/8b/ht4__40d6qdcxj1qqfwjdb9h0000gn/T//Rtmpkvvml0/downloaded_packages
library(dplyr)
library(tidyverse)
library(countrycode)
# Read the raw CSV. read.csv(header = TRUE) does not pick up the real column
# names here: they arrive as the first data row. That row is therefore
# promoted to the column names and then removed from the data.
df <- read.csv(file = "Cost_of_Living_Index_2022.csv", header = TRUE)
names(df) <- df[1, ]
df <- df[-1, ]
View(df)
# Per-column overview of the freshly loaded data
summary(df)
## Rank Country Cost of Living Index Rent Index
## Length:139 Length:139 Length:139 Length:139
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Cost of Living Plus Rent Index Groceries Index Restaurant Price Index
## Length:139 Length:139 Length:139
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## Local Purchasing Power Index
## Length:139
## Class :character
## Mode :character
# Show the type and first few values of every column of df
str(df)
## 'data.frame': 139 obs. of 8 variables:
## $ Rank : chr "1" "2" "3" "4" ...
## $ Country : chr "Afghanistan" "Albania" "Algeria" "Argentina" ...
## $ Cost of Living Index : chr "20.37" "35.5" "26.87" "34.69" ...
## $ Rent Index : chr "2.72" "8.47" "4.59" "7.71" ...
## $ Cost of Living Plus Rent Index: chr "12.09" "22.83" "16.43" "22.04" ...
## $ Groceries Index : chr "14.92" "29.32" "28.82" "28.17" ...
## $ Restaurant Price Index : chr "12.41" "25.82" "14.48" "33.32" ...
## $ Local Purchasing Power Index : chr "23.04" "30.19" "24.63" "30.72" ...
# TRUE if at least one cell anywhere in df is NA
anyNA(df)
## [1] FALSE
# Drop Rank (none of the analysis below uses it) and convert every remaining
# column except Country from character to numeric. Columns are addressed by
# name rather than by position so the step does not depend on column order.
df <- df %>%
  select(-Rank) %>%
  mutate(across(.cols = -Country, .fns = as.numeric))
str(df)
## 'data.frame': 139 obs. of 7 variables:
## $ Country : chr "Afghanistan" "Albania" "Algeria" "Argentina" ...
## $ Cost of Living Index : num 20.4 35.5 26.9 34.7 33.9 ...
## $ Rent Index : num 2.72 8.47 4.59 7.71 11.61 ...
## $ Cost of Living Plus Rent Index: num 12.1 22.8 16.4 22 23.4 ...
## $ Groceries Index : num 14.9 29.3 28.8 28.2 27.6 ...
## $ Restaurant Price Index : num 12.4 25.8 14.5 33.3 30.6 ...
## $ Local Purchasing Power Index : num 23 30.2 24.6 30.7 28.9 ...
1. Cost of living index - Let’s first visualize the distribution of the outcome variable
options(repr.plot.width = 14, repr.plot.height = 8)
# Histogram and kernel density of the outcome variable on a shared density
# scale; the dashed red line marks the mean of the column.
# after_stat(density) replaces the deprecated ..density.. notation.
# NOTE(review): xlim() discards observations outside [0, 100] before the
# histogram/density are computed, while the mean line uses every row. Confirm
# that no index value exceeds 100, or zoom with coord_cartesian() instead.
ggplot(df, aes(x = `Cost of Living Index`)) +
  geom_histogram(
    aes(y = after_stat(density)),
    position = "identity",
    fill = "#E69F00",
    color = "#E69F00",
    alpha = 0.6
  ) +
  geom_density(fill = "#E69F00", alpha = 0.3, color = "#E69F00") +
  geom_vline(
    xintercept = mean(df$`Cost of Living Index`),
    linetype = "dashed",
    color = "red"
  ) +
  xlim(0, 100) +
  labs(title = "Cost of Living Index Distribution", y = "Density") +
  theme(text = element_text(size = 14))
All Indices (Rent, Grocery, Purchase Power, Restaurant Price) - Let’s take a look at the distribution curve of all numerical variables.
options(repr.plot.width = 14, repr.plot.height = 8)
# One density-scaled histogram plus density curve per numeric column.
# The numeric columns are stacked into long form (key = column name,
# value = observation) so that facet_wrap() can draw one panel per index.
# pivot_longer() replaces the superseded gather(), after_stat(density)
# replaces the deprecated ..density.., and the stray `fill = value` mapping
# is removed because every layer either overrides or drops it.
df %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(
    aes(y = after_stat(density)),
    position = "identity",
    color = "blue2",
    fill = "cornflowerblue",
    alpha = 0.9
  ) +
  geom_density(color = "blue") +
  facet_wrap(~ key, scales = "free", ncol = 2) +
  theme(text = element_text(size = 14))
Top 10 countries with the highest cost of living index
options(repr.plot.width = 14, repr.plot.height = 8)
# Bar chart of the ten rows with the largest Cost of Living Index, ordered
# from highest to lowest.
# The sort column is referenced by name inside the pipe (not via df$...), so
# the step always sorts the data that is actually flowing through the pipe.
# The former scale_fill_brewer() call is dropped: no fill aesthetic is
# mapped, so it had no effect on the plot.
df %>%
  arrange(desc(`Cost of Living Index`)) %>%
  slice_head(n = 10) %>%
  ggplot(aes(
    x = reorder(Country, -`Cost of Living Index`),
    y = `Cost of Living Index`
  )) +
  geom_col(color = "skyblue", fill = "cornflowerblue") +
  labs(
    x = " Country",
    y = "Cost of Living Index",
    title = "Top 10 Countries with Highest Cost of Living Index "
  ) +
  theme(
    axis.text.x = element_text(angle = 60),
    text = element_text(size = 14),
    plot.title = element_text(size = 13)
  )
Top 10 countries with the highest rent index
options(repr.plot.width = 14, repr.plot.height = 8)
# Bar chart of the ten rows with the largest Rent Index, ordered from highest
# to lowest.
# The sort column is referenced by name inside the pipe (not via df$...), so
# the step always sorts the data that is actually flowing through the pipe.
# The former scale_fill_brewer() call is dropped: no fill aesthetic is
# mapped, so it had no effect on the plot.
df %>%
  arrange(desc(`Rent Index`)) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = reorder(Country, -`Rent Index`), y = `Rent Index`)) +
  geom_col(color = "skyblue", fill = "royalblue", alpha = 0.6) +
  labs(
    x = "Country",
    y = "Rent Index",
    title = "Top 10 Countries with Highest Rent Index "
  ) +
  theme(
    axis.text.x = element_text(angle = 60),
    text = element_text(size = 14),
    plot.title = element_text(size = 13)
  )
Correlation Matrix
library(psych)
# Pairwise correlations between all numeric columns of df
df_cor <- df %>%
  select(where(is.numeric)) %>%
  cor()
summary(df_cor)
## Cost of Living Index Rent Index Cost of Living Plus Rent Index
## Min. :0.6872 Min. :0.6381 Min. :0.6953
## 1st Qu.:0.8622 1st Qu.:0.7994 1st Qu.:0.9227
## Median :0.9500 Median :0.8191 Median :0.9379
## Mean :0.9000 Mean :0.8357 Mean :0.9106
## 3rd Qu.:0.9724 3rd Qu.:0.9136 3rd Qu.:0.9657
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## Groceries Index Restaurant Price Index Local Purchasing Power Index
## Min. :0.6265 Min. :0.6908 Min. :0.6265
## 1st Qu.:0.8122 1st Qu.:0.8130 1st Qu.:0.6504
## Median :0.8942 Median :0.8847 Median :0.6890
## Mean :0.8632 Mean :0.8658 Mean :0.7230
## 3rd Qu.:0.9583 3rd Qu.:0.9303 3rd Qu.:0.6942
## Max. :1.0000 Max. :1.0000 Max. :1.0000
options(repr.plot.width = 14, repr.plot.height = 8)
# Colour-coded plot of the correlation matrix with black text labels drawn
# at 75% of the default size
corrplot::corrplot(
  corr = df_cor,
  method = "color",
  tl.col = "black",
  tl.cex = 0.75
)
- All variables are positively correlated with each other.
- The cost of living index tends to be highly positively correlated with the
  cost of living plus rent, groceries, and restaurant price indices.
Regression model
# Linear model of the Cost of Living Index on all other numeric columns.
# The response is named as a column of `data` (not as df$...), so the whole
# formula is resolved inside `data` and predict() can be used with newdata.
# NOTE(review): "Cost of Living Plus Rent Index" looks like a composite of
# the response and the Rent Index (the summary reports R-squared = 1), which
# would make this fit close to an identity - consider dropping that predictor.
m <- lm(`Cost of Living Index` ~ ., data = select(df, where(is.numeric)))
summary(m)
##
## Call:
## lm(formula = df$`Cost of Living Index` ~ ., data = select_if(df,
## is.numeric))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0138766 -0.0054135 0.0007739 0.0050219 0.0155829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.472e-03 2.086e-03 -0.706 0.4815
## `Rent Index` -8.826e-01 2.004e-04 -4403.822 <2e-16 ***
## `Cost of Living Plus Rent Index` 1.883e+00 3.714e-04 5069.842 <2e-16 ***
## `Groceries Index` -1.593e-04 1.298e-04 -1.227 0.2220
## `Restaurant Price Index` -1.603e-04 8.001e-05 -2.003 0.0472 *
## `Local Purchasing Power Index` 2.756e-05 3.120e-05 0.883 0.3787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.006926 on 133 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.504e+08 on 5 and 133 DF, p-value: < 2.2e-16
Interpretation
Plot of p-values
# Extraction: one row per model term with its p-value and estimate; the term
# names (row names of the coefficient table) are copied into a column so they
# can be used as a plotting aesthetic.
pvalue <- data.frame(summary(m)$coefficients[, c("Pr(>|t|)", "Estimate")])
pvalue$var_name <- rownames(pvalue)
colnames(pvalue) <- c("p-value", "coefficient", "variables")
View(pvalue)
# Plot: p-value per term, ordered by increasing p-value, with a red reference
# line at the 0.05 level.
# The "0.05" label is added with annotate() so it is drawn exactly once;
# geom_text() with constant aesthetics would draw one overlapping copy per
# row of `pvalue`.
options(repr.plot.width = 14, repr.plot.height = 8)
ggplot(pvalue, aes(x = reorder(variables, `p-value`), y = `p-value`)) +
  geom_col(fill = "cornflowerblue") +
  labs(title = "P-value Factors on Cost of Living Index", x = " ") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  geom_hline(yintercept = 0.05, color = "red") +
  annotate("text", x = 2, y = 0.08, label = "0.05", size = 4.0, color = "red")
How does the cost of living index vary across continents?
# Derive a region label for every country from its English name.
# NOTE(review): destination = "region" yields countrycode's regional grouping
# rather than continents; use destination = "continent" if true continents
# are wanted. Names countrycode cannot match come back as NA - verify.
df$Continent <- countrycode(
  sourcevar = df$Country,
  origin = "country.name",
  destination = "region"
)
# Mean Cost of Living Index per region
df_continent <- df %>%
  select(`Cost of Living Index`, Continent) %>%
  group_by(Continent) %>%
  summarise(Mean_Cost_of_Living_Index = mean(`Cost of Living Index`))
options(repr.plot.width = 14, repr.plot.height = 8)
# Bar chart of the regional means, highest first.
# The stray empty argument in labs() is removed, as is the former
# scale_fill_brewer() call, which had no effect without a mapped fill.
ggplot(
  df_continent,
  aes(
    x = reorder(Continent, -Mean_Cost_of_Living_Index),
    y = Mean_Cost_of_Living_Index
  )
) +
  geom_col(color = "Skyblue", fill = "cornflowerblue", alpha = 0.9) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    y = "Mean Cost of Living Index",
    x = "Continental Region",
    title = "Mean Living Cost Index by Continental Region"
  )
# Same regional means drawn on polar coordinates: one wedge per region,
# coloured by region, with the region names shown only in the legend.
ggplot(
  data = df_continent,
  mapping = aes(
    x = Continent,
    y = Mean_Cost_of_Living_Index,
    fill = Continent
  )
) +
  geom_col(width = 1, color = "white", alpha = .6) +
  coord_polar(theta = "x", start = 20) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Mean Cost of Living Index by Continental Region",
    x = " ",
    y = " "
  ) +
  theme(
    plot.title = element_text(size = 12, hjust = 1, face = "bold"),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 12),
    legend.position = "right",
    legend.title = element_blank(),
    legend.text = element_text(size = 12)
  )
Let’s visualize the cost of living index across countries and continents
install.packages(c("rworldmap", "ggplot2", "sf", "scales", "rnaturalearth", "rnaturalearthdata", "plotly"),repos = "http://cran.rstudio.com/")
##
## The downloaded binary packages are in
## /var/folders/8b/ht4__40d6qdcxj1qqfwjdb9h0000gn/T//Rtmpkvvml0/downloaded_packages
library(ggplot2)
library(sf)
library(rworldmap)
library(plotly)
Cost of Living Index
options(repr.plot.width = 14, repr.plot.height = 8)
# Attach the index data to rworldmap's country polygons, matching the values
# of the Country column against the map's country names
index.map <- joinCountryData2Map(
  dF = df,
  nameJoinColumn = "Country",
  joinCode = "NAME"
)
# Remove the plot margins except for one line at the top
par(mar = c(0, 0, 1, 0))
# Choropleth of the Cost of Living Index
mapCountryData(
  mapToPlot = index.map,
  nameColumnToPlot = "Cost of Living Index",
  colourPalette = "diverging"
)
**Interactive Map: Cost of Living Index**
# Two-column lookup table whose key is named `region`, like the country
# column of map_data("world")
df1 <- df %>% select(`Country`, `Cost of Living Index`)
colnames(df1) <- c("region", "Cost_of_Living_Index")
mapdata <- map_data("world")
# Keep every map polygon (left join on the map data) so that countries
# without an index value are still drawn, in the na.value colour below. A
# right join would drop those polygons and leave rows without coordinates.
# NOTE(review): countries whose name differs between the two sources get no
# value - map_data("world") presumably uses short names such as "USA"/"UK";
# check with anti_join(df1, mapdata, by = "region").
mapdata <- left_join(mapdata, df1, by = "region")
options(repr.plot.width = 14, repr.plot.height = 8)
# Static choropleth: yellow (low) to red (high), grey where no value exists;
# axes and background are removed
map1 <- ggplot(mapdata, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Cost_of_Living_Index), color = "black") +
  scale_fill_gradient(
    name = "Cost of Living Index",
    low = "yellow",
    high = "red",
    na.value = "gray50"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    rect = element_blank()
  )
# Interactive version. The widget size is passed to ggplotly() directly:
# assigning map2$layout$height / $width only adds a top-level field to the
# returned object and does not touch the plotly layout stored under `x`.
map2 <- ggplotly(map1, width = 800, height = 600) %>%
  highlight(
    "plotly_hover",
    selected = attrs_selected(line = list(color = "black"))
  )
map2